Findings

library(tidyverse)
## ── Attaching packages ───────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.3.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)

## Load data
## Provides (at least) individual_df (one row per placement, with
## placing_univ_id / hiring_univ_id / permanent) and univ_df (one row
## per PhD program), as built by the 01 parsing script
load('01_parsed.Rdata')

Finding: 25% of programs account for about 50% of (permanent) placements

## Lorenz-style concentration curve: cumulative share of permanent
## placements against program rank
univ_df %>%
    ggplot(aes(x = perm_placement_rank, y = frac_cum_perm_placements)) +
    geom_step() +
    scale_x_continuous(name = 'PhD Programs',
                       labels = scales::percent_format()) +
    scale_y_continuous(name = 'Permanent Placements',
                       labels = scales::percent_format())
## Warning: Removed 20 rows containing missing values (geom_path).

Build network

## NB Only permanent placements
## graph_from_data_frame() reads the first two columns of the edge list
## as (from, to), so placing_univ_id and hiring_univ_id are moved to the
## front with select(); the remaining columns become edge attributes.
## univ_df supplies the vertex set (first column = vertex name) plus
## vertex attributes.
hiring_network = individual_df %>%
    filter(permanent) %>%
    select(placing_univ_id, hiring_univ_id, everything()) %>%
    graph_from_data_frame(directed = TRUE, 
                          vertices = univ_df)

## 1 giant component contains almost all of the schools
## (size 786, per the output below; nearly everything else is an isolate)
components(hiring_network)$csize
##   [1]   1   1   1   1   1 786   1   1   2   1   1   1   1   1   1   1   1
##  [18]   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
##  [35]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
##  [52]   1   1   1   1   1   1   4   1   1   1   1   1   1   1   1   1   1
##  [69]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
##  [86]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [103]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [120]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [137]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [154]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [171]   1   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [188]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [205]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [222]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [239]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [256]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [273]   1   1   1   1   1   1   1

Out- and in-centrality

## Fix the RNG seed: low-end centrality scores appear to vary slightly
## across runs (see note below) -- presumably from the iterative
## eigenvector solver's starting point; TODO confirm
set.seed(42)
## In-centrality: eigenvector centrality over the directed hiring edges
V(hiring_network)$in_centrality = eigen_centrality(hiring_network, 
                                                   directed = TRUE)$vector
## Return a copy of `graph` with every edge direction reversed;
## undirected graphs are returned unchanged.  Used below to compute
## out-centrality: eigen_centrality() scores vertices by incoming edges,
## so running it on the reversed graph scores outgoing ones.
##
## graph: an igraph object
## Returns: an igraph object with the same vertices/attributes and
##   reversed edges
graph.reverse <- function (graph) {
    if (!is_directed(graph)) {
        return(graph)
    }
    edges <- as_data_frame(graph, what = "edges")
    ## Swap the "from" and "to" columns (always columns 1 and 2); any
    ## edge-attribute columns keep their positions.  Restoring the
    ## original column names makes graph_from_data_frame() read the
    ## swapped columns as the new (from, to).
    new_order <- seq_along(edges)
    new_order[1:2] <- c(2L, 1L)
    edges <- edges[new_order]
    names(edges) <- names(edges)[new_order]
    graph_from_data_frame(edges,
                          vertices = as_data_frame(graph, what = "vertices"))
}
## Out-centrality: eigenvector centrality computed on the edge-reversed
## network, so high scores go to programs whose graduates flow toward
## central departments
V(hiring_network)$out_centrality = 
    eigen_centrality(graph.reverse(hiring_network), 
                     directed = TRUE)$vector

## Add centrality statistics to the university df
## Vertex names were taken from univ_df$univ_id when the graph was built,
## so indexing V() by univ_df$univ_id matches scores back row-by-row.
## NB V(g)[...] uses igraph's non-standard evaluation; the explicit
## univ_df$ prefix (rather than a bare column name inside mutate) keeps
## the lookup unambiguous.
univ_df = univ_df %>%
    mutate(out_centrality = V(hiring_network)[univ_df$univ_id]$out_centrality, 
           in_centrality = V(hiring_network)[univ_df$univ_id]$in_centrality)

Exploring centrality scores

There are two clear groups of centrality scores, with high scores (in the range of 10^-5 to 1) and low scores (10^-15 and smaller).

##
## NB there seem to be (small?) differences in scores (at the low end?) across runs of the centrality algorithm
## Out-centrality distribution on a log10 scale; zero centralities map to
## -Inf under the log transform, hence the warnings below
ggplot(univ_df, aes(out_centrality)) + 
    geom_density() + geom_rug() +
    scale_x_continuous(trans = 'log10')
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 11 rows containing non-finite values (stat_density).

## Same view for in-centrality
ggplot(univ_df, aes(in_centrality)) + 
    geom_density() + geom_rug() +
    scale_x_continuous(trans = 'log10')
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1 rows containing non-finite values (stat_density).

## Out- vs in-centrality, colored by semantic cluster.  The `text`
## aesthetic is not used by ggplot itself; it becomes the hover tooltip
## in the plotly rendering below.  geom_jitter() reduces overplotting.
ggplot(univ_df, aes(out_centrality, in_centrality, 
                    color = cluster, 
                    text = univ_name)) +
    geom_jitter() + 
    scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

## ggplotly() with no arguments converts the most recent ggplot
plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

Finding: High output is only modestly correlated w/ centrality. Programs like ND, CUNY, New School produce lots of PhDs, but they aren’t placed into the high-centrality departments.

## Total placements vs log out-centrality; `text` is only for the plotly
## tooltip, so ggplot warns that it is an unknown aesthetic
ggplot(univ_df, aes(total_placements, log10(out_centrality)
                    )) +
    geom_point(aes(text = univ_name))
## Warning: Ignoring unknown aesthetics: text
## Warning: Removed 913 rows containing missing values (geom_point).

plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Correlation on complete cases only; out_centrality > 0 drops the -Inf
## values produced by log10(0), which !is.na() alone would not catch
univ_df %>%
    mutate(out_centrality_log = log10(out_centrality)) %>%
    filter(!is.na(out_centrality_log) & 
               out_centrality > 0 &
               !is.na(total_placements)) %>%
    select(total_placements, out_centrality_log) %>%
    cor()
##                    total_placements out_centrality_log
## total_placements          1.0000000          0.3614052
## out_centrality_log        0.3614052          1.0000000
## Placement rank vs log out-centrality
ggplot(univ_df, aes(perm_placement_rank, log10(out_centrality))) +
    geom_point()
## Warning: Removed 913 rows containing missing values (geom_point).

## Placement *rate* vs log out-centrality, with plotly tooltips
ggplot(univ_df, aes(perm_placement_rate, log10(out_centrality), 
                    text = univ_name)) + 
    geom_point()
## Warning: Removed 913 rows containing missing values (geom_point).

plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## While individuals can move from low to high centrality in temporary positions, this never happens with permanent positions.  However, this is expected from the way centrality is calculated.  
# dataf %>%
#     # filter(permanent) %>%
#     left_join(select(univ_df, univ_id, out_centrality), 
#           by = c('placing_univ_id' = 'univ_id')) %>%
#     left_join(select(univ_df, univ_id, out_centrality), 
#               by = c('hiring_univ_id' = 'univ_id')) %>%
#     ggplot(aes(log10(out_centrality.x), 
#                log10(out_centrality.y))) + 
#     geom_point() + 
#     xlab('Placing University Centrality') +
#     ylab('Hiring University Centrality') +
#     stat_function(fun = function (x) x, linetype = 'dashed') +
#     facet_grid(~ permanent)

Community detection

## steps = 26 has low values for both entropy of cluster sizes (high delta H) and total # clusters
## but somewhere along the way this started producing >300 clusters,
## because the full network has ~300 connected components
## Extract giant component
## components() is run once (the original recomputed it mid-pipe), and
## which.max() guarantees a single component index even if several tie
## for the largest size (which(. == max(.)) could return several and
## silently recycle in the membership comparison)
hiring_network_gc = hiring_network %>%
    components() %>%
    {which(.$membership == which.max(.$csize))} %>%
    induced_subgraph(hiring_network, .)

# walk_len = rep(2:100, 1)
# # ## NB Takes a minute or so
# cluster_stats = walk_len %>%
#   map(~ cluster_walktrap(hiring_network_gc, steps = .x)) %>%
#   map(~ list(sizes = sizes(.x), length = length(.x))) %>%
#   map_dfr(~ tibble(H = -sum(.x$sizes/sum(.x$sizes) * log2(.x$sizes/sum(.x$sizes))),
#                    n_clusters = .x$length)) %>%
#   mutate(walk_len = as.factor(walk_len),
#          delta_H = log2(n_clusters) - H)
# 
# ggplot(cluster_stats, aes(walk_len, delta_H)) +
#     geom_point() +
#     coord_flip()
# ggplot(cluster_stats, aes(walk_len, n_clusters)) +
#     geom_point() +
#     coord_flip()
# ggplot(cluster_stats, aes(n_clusters, delta_H)) +
#   geom_text(aes(label = walk_len)) +
#   scale_x_continuous()


## Walktrap community detection on the giant component (steps = 26,
## chosen via the tuning code above)
communities = cluster_walktrap(hiring_network_gc, steps = 26)
V(hiring_network_gc)$community = membership(communities)
## Attach community labels to univ_df; community is converted to
## character so downstream plots treat it as discrete.  The join key is
## explicit (the original relied on left_join's natural-join detection,
## which printed 'Joining, by = "univ_id"').
univ_df = univ_df %>%
    left_join({hiring_network_gc %>%
            as_data_frame(what = 'vertices') %>% 
            select(univ_id = name, community) %>%
            mutate(community = as.character(community))}, 
        by = 'univ_id')

Finding: There is no correlation between semantic clusters and topological communities.

## Chi-squared test of independence between topological community and
## semantic cluster; the p-value is Monte Carlo simulated, presumably
## because many community x cluster cells have small expected counts
## (asymptotic approximation unreliable) -- TODO confirm
univ_df %>%
    filter(!is.na(community), !is.na(cluster)) %>%
    select(community, cluster) %>%
    table() %>%
    chisq.test(simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  .
## X-squared = 219.96, df = NA, p-value = 0.2619
## Stacked composition of each topological community by semantic cluster,
## ordered by the entropy of the cluster mix.
## NB the original computed H = sum(p * log2(p)), the *negative* of
## Shannon entropy, and compensated with .desc = TRUE in fct_reorder();
## here H is true entropy and the default ascending order is used -- the
## plotted ordering is identical (least-mixed communities first).
univ_df %>%
    filter(!is.na(community), !is.na(cluster)) %>%
    count(community, cluster) %>%
    rename(cluster_n = n) %>%
    group_by(community) %>%
    mutate(community_tot = sum(cluster_n), 
           cluster_frac = cluster_n / community_tot, 
           ## Shannon entropy of the cluster mix within this community
           H = -sum(cluster_frac * log2(cluster_frac))) %>%
    ggplot(aes(fct_reorder(community, H),
               cluster_n, fill = cluster)) + 
    geom_col() + 
    coord_flip() +
    xlab('Topological communities') +
    ylab('# of schools in community') +
    scale_fill_brewer(palette = 'Set1', 
                       name = 'Semantic\nclusters')

Core elite universities

## Start w/ Oxford, and work upstream
## Only need to go 8 steps to get closure
## Grow the in-neighborhood around vertex '512' (Oxford) at increasing
## radius and plot neighborhood size vs radius; the curve flattens once
## the upstream closure is reached.
1:25 %>%
    map(~ make_ego_graph(hiring_network, order = .x, 
                         nodes = '512', mode = 'in')) %>%
    flatten() %>%
    map(~ length(V(.x))) %>%
    ## Braces stop magrittr from also splicing the piped list in as
    ## tibble()'s first (unnamed) argument -- the unbraced original
    ## added a stray `.` list-column.  seq_along() replaces 1:length().
    {tibble(order = seq_along(.), 
            size = unlist(.))} %>%
    ggplot(aes(order, size)) + geom_point()

## Upstream closure of vertex '512' (Oxford): everything within 8
## in-steps.  make_ego_graph() returns a one-element list here, so the
## single graph is extracted directly.
elites = make_ego_graph(hiring_network, order = 8, 
                        nodes = '512', mode = 'in')[[1]]

## How large is the elite community?  
## 61 programs; 6% of all programs in the network; 
## 39% of programs with at least 1 placement in the dataset
length(V(elites))
## [1] 61
length(V(elites)) / length(V(hiring_network))
## [1] 0.05700935
length(V(elites)) / sum(!is.na(univ_df$total_placements))
## [1] 0.388535
## What fraction of hires are within elites?  
## 15% of all permanent hires; 7% of all hires
## (hiring_network contains permanent hires only; individual_df has all)
length(E(elites)) / length(E(hiring_network))
## [1] 0.1542219
length(E(elites)) / nrow(individual_df)
## [1] 0.07828551
## Uncomment the png()/dev.off() pair to write the plot to disk
# png(file = '02_elite_net.png', 
#     width = 11, height = 11, units = 'in', res = 400)
## Node-link plot of the elite subnetwork; label size tracks
## log out-centrality
ggraph(elites) + 
    geom_node_label(aes(label = univ_name, 
                        size = log10(out_centrality))) + 
    geom_edge_fan(arrow = arrow(length = unit(.01, 'npc')), 
                  alpha = .25,
                  spread = 5) +
    scale_size_continuous(range = c(.5, 3)) +
    theme_graph()
## Using `nicely` as default layout

# dev.off()

## "Elite" status = high centrality group
univ_df = univ_df %>%
    mutate(elite = univ_id %in% V(elites)$name)

ggplot(univ_df, aes(elite, log10(out_centrality))) + 
    geom_jitter()

## "Elite" status and clusters
## Elites basically don't appear in clusters 2, 3, 7
univ_df %>%
    filter(!is.na(cluster)) %>%
    ggplot(aes(cluster, color = elite)) + 
    geom_point(stat = 'count') +
    geom_line(aes(group = elite), stat = 'count')

## Elites are mostly confined to the 3 large communities
univ_df %>%
    filter(!is.na(community)) %>%
    ggplot(aes(as.integer(community), fill = elite)) + 
    geom_bar()

## What fraction of elite graduates end up in elite programs? 
## 221 / (221 + 569) = 28% of those w/ permanent placements
## After the two joins: .x = placing university, .y = hiring university
individual_df %>%
    filter(permanent) %>%
    left_join(univ_df, by = c('placing_univ_id' = 'univ_id')) %>%
    left_join(univ_df, by = c('hiring_univ_id' = 'univ_id')) %>%
    select(elite.x, elite.y) %>%
    table()
##        elite.y
## elite.x FALSE TRUE
##   FALSE   643    0
##   TRUE    569  221

Finding: Median permanent placement rate for elite programs is 14 points higher than for non-elite programs. However, variation is also wide within each group; even among elite programs, median permanent placement rate is only 58%. This is also not yet controlling for graduation year, area, or demographics.

## Permanent placement rate by elite status; 913 programs lack a
## perm_placement_rate, hence the removed-rows warnings.  `label` feeds
## the plotly tooltips below.
ggplot(univ_df, aes(elite, perm_placement_rate, 
                    label = univ_name)) + 
    geom_boxplot(color = 'red') +
    geom_jitter() +
    scale_y_continuous(labels = scales::percent_format())
## Warning: Removed 913 rows containing non-finite values (stat_boxplot).
## Warning: Removed 913 rows containing missing values (geom_point).

plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: Removed 913 rows containing non-finite values (stat_boxplot).

Plotting

Coarser community structure

## Communities with >100 members keep their own label; everything else
## is lumped into 'other'.  NB ifelse() returns a character vector here:
## mixing the numeric community labels with the string 'other' promotes
## the whole result to character.
large_clusters = which(sizes(communities) > 100)
V(hiring_network_gc)$community_coarse = ifelse(
    V(hiring_network_gc)$community %in% large_clusters, 
    V(hiring_network_gc)$community, 
    'other')

FR

## Node-link plot of the giant component, restricted to programs with
## out-degree > 10, colored by coarse community.
## NB inside the pipe, `.` appears only in a nested call, so magrittr
## still inserts the graph as induced_subgraph()'s first argument --
## i.e. this reads induced_subgraph(g, which(degree(g, mode = 'out') > 10))
hiring_network_gc %>%
    induced_subgraph(which(degree(., mode = 'out') > 10)) %>%
    ggraph() +
    # geom_node_label(aes(label = univ_name, 
    #                     size = log10(1 + out_centrality), 
    #                     color = as.factor(community))) +
    geom_edge_fan(arrow = arrow(length = unit(.01, 'npc')), 
                  spread = 5) +
    geom_node_point(aes(size = log10(1 + out_centrality), 
                        # color = as.factor(community))) +
                        color = community_coarse)) +
    scale_color_brewer(palette = 'Set1', guide = FALSE) +
    scale_size(range = c(2, 6)) +
    theme_graph()
## Using `nicely` as default layout

Chord diagram

## Chord diagram: nodes on a circle (sorted by coarse community), arcs
## for hires; restricted to programs with out-degree > 1.  Same
## nested-dot pipe pattern as the plot above.
hiring_network_gc %>%
    induced_subgraph(which(degree(., mode = 'out') > 1)) %>%
    ggraph(layout = 'linear', sort.by = 'community_coarse', circular = TRUE) +
    geom_edge_arc(arrow = arrow(length = unit(.01, 'npc')), alpha = .1) +
    geom_node_point(aes(size = log10(1 + out_centrality), 
                        # color = as.factor(community))) +
                        color = community_coarse)) +
    scale_color_brewer(palette = 'Set1', guide = FALSE) +
    theme_graph()

## Save university-level data with network statistics
## (univ_df now carries centrality scores, community labels, and the
## elite flag added above)
save(univ_df, file = '02_univ_net_stats.Rdata')